* This do-file documents how the following 3 datasets were created:
*
* 1. term_disc_merged.dta
* 	All utility patents issued between Jan. 1, 1990 and June 6, 2023
* 	Data sources: USPTO PatentsView and PatEx datasets
*
* 2. litigated_patents_term_disc_no_dups.dta
*	Litigated patents within our date range (with duplicates dropped)
*	Data sources: term_disc_merged.dta and USPTO litigated patent data
*
* 3. nda_patents_term_disc_no_dups.dta
*	FDA Orange Book patents within our date range (with duplicates dropped)
*	Data sources: term_disc_merged.dta and Ouellette & Sampat Orange Book data


************************************
* PatentsView Terminal Disclaimers *
************************************

* https://patentsview.org/download/data-download-tables
* https://patentsview.org/download/data-download-dictionary
* Most recent update: Mar. 31, 2024
* Download g_patent, g_application, and g_us_term_of_grant, convert to .dta

use "g_patent.dta", clear
drop patent_title wipo_kind filename withdrawn
label variable patent_id "US Patent No. (unique)"
label variable num_claims "Number of Claims (PatentsView)"

* Convert date format and drop patents before Jan. 1, 1990 or after Jun. 6, 2023
gen patent_date2 = date(patent_date, "YMD")
format patent_date2 %td
drop patent_date
rename patent_date2 patent_date
drop if patent_date < td(01jan1990)
drop if patent_date > td(06jun2023)
label variable patent_date "Patent Issue Date"

* Keep only utility patents
drop if substr(patent_id, 1, 1) == "D"
drop if substr(patent_id, 1, 1) == "P"
drop if substr(patent_id, 1, 1) == "H"
drop if substr(patent_id, 1, 1) == "T"
* Drop RE patents that are design patents
drop if patent_type == "design"
drop patent_type

* Create numeric version of patent number
gen byte reissue = 0
replace reissue = 1 if substr(patent_id, 1, 1) == "R"
label variable reissue "Reissue (binary)"
gen patent_numeric = patent_id
replace patent_numeric = substr(patent_id, 3, .) if substr(patent_id, 1, 1) == "R"
destring patent_numeric, replace
label variable patent_numeric "Numeric Portion of Patent No."

* Merge in application data
merge 1:1 patent_id using "g_application.dta"
drop if _merge == 2
drop _merge patent_application_type series_code rule_47_flag
label variable application_id "Patent Appl. No. (PatentsView)"

* Convert date from string to date format
gen filing_date2 = date(filing_date, "YMD")
format filing_date2 %td
drop filing_date
rename filing_date2 filing_date
label variable filing_date "Filing Date (PatentsView)"

* Merge in terminal disclaimer data
merge 1:1 patent_id using "g_us_term_of_grant.dta"
drop if _merge == 2
drop _merge

* Generate new variable term_disc_pv indicating terminal disclaimer in PatentsView
* disclaimer_date = date of the terminal disclaimer
* term_disclaimer = disclaimer if the patent is subject to a terminal disclaimer
* (seems like only one of these is filled in for any given patent)
gen byte term_disc_pv = 0
replace term_disc_pv = 1 if disclaimer_date != ""
replace term_disc_pv = 1 if term_disclaimer != ""
label variable term_disc_pv "Terminal Disclaimer (PatentsView)"
label variable term_extension "Term Extension Days (PatentsView)"

* Clean up
drop disclaimer_date term_disclaimer term_grant
order patent_id reissue patent_numeric patent_date application_id filing_date term_disc_pv term_extension num_claims
recast int term_extension num_claims
sort patent_date reissue patent_numeric
save "term_disc_pv.dta", replace


********************************************************************
* Patent Examination Research Dataset (PatEx) Terminal Disclaimers *
********************************************************************

* https://www.uspto.gov/ip-policy/economic-research/research-datasets/patent-examination-research-dataset-public-pair
* Most recent 2022 release is based on data from the Patent Examination Data System (PEDS) in June 2023
* Download transactions.dta and application_data.dta
* Most recent issued patents are June 6, 2023, ending with 11672191 and RE49549

* Convert the transactions data to just have terminal disclaimer filing events
* 	DIST			Terminal Disclaimer Filed
* 	DISQ			Terminal Disclaimer Review Decision
use "transactions.dta", clear
keep if (event_code == "DIST") | (event_code == "DISQ")
drop if substr(application_number, 1, 3) == "PCT"
recast str8 application_number
format application_number %9s
sort application_number event_code recorded_date
duplicates drop application_number, force
drop event_code
gen byte term_disc_patex = 1
label variable term_disc_patex "Terminal Disclaimer (PatEx)"
label variable recorded_date "1st Term Disc Date (PatEx)"
save "transactions_term_disc.dta", replace

* Start with the application_data file
use "application_data.dta", clear
keep patent_number patent_issue_date application_number filing_date
merge 1:1 application_number using "transactions_term_disc.dta"
drop _merge

* Keep only issued utility patents
drop if patent_number == ""
rename patent_number patent_id
label variable patent_id "US Patent No. (unique)"
drop if substr(patent_id, 1, 1) == "D"
drop if substr(patent_id, 1, 1) == "P"
drop if substr(patent_id, 1, 1) == "H"
drop if substr(patent_id, 1, 1) == "T"
* Drop RE patents in PatEx that are actually design patents
drop if patent_id == "RE34182"
drop if patent_id == "RE34449"
drop if patent_id == "RE34867"
drop if patent_id == "RE35569"

* Create numeric version of patent number
gen byte reissue = 0
replace reissue = 1 if substr(patent_id, 1, 1) == "R"
label variable reissue "Reissue (binary)"
gen patent_numeric = patent_id
replace patent_numeric = substr(patent_id, 3, .) if substr(patent_id, 1, 1) == "R"
destring patent_numeric, replace
label variable patent_numeric "Numeric Portion of Patent No."

* Drop patents before Jan. 1, 1990
* 1990 started with patent 4890335 and reissue patent RE33138
* https://www.uspto.gov/web/offices/ac/ido/oeip/taf/issuyear.htm
drop if reissue == 0 & patent_numeric < 4890335
drop if reissue == 1 & patent_numeric < 33138

* Clean up
order patent_id reissue patent_numeric patent_issue_date application_number filing_date term_disc_patex recorded_date
rename application_number application_patex
label variable application_patex "Patent Appl. No. (PatEx)"
rename patent_issue_date patent_date_patex
label variable patent_date_patex "Patent Issue Date (PatEx)"
rename filing_date filing_date_patex
label variable filing_date_patex "Filing Date (PatEx)"
recast str8 application_patex
format application_patex %9s
sort patent_date_patex reissue patent_numeric
save "term_disc_patex.dta", replace


************************************
* Merge PatentsView and PatEx Data *
************************************

use "term_disc_pv.dta", clear
merge 1:1 patent_id using "term_disc_patex.dta"
gen source = "both"
replace source = "pv" if _merge == 1
replace source = "patex" if _merge == 2
label variable source "Data Source (PatentsView, PatEx, both)"
drop _merge

* Fill in patent issue date for 98 observations only in PatEx
replace patent_date = patent_date_patex if source == "patex"
* Manually check 8 where source=="both" but issue dates mismatch; for 3, PatEx is right
* Note there are 833K (almost 12%) where source=="both" and filing dates mismatch
* From spot check, seems like PatentsView has PCT filing date, PatEx has US filing; keep both
replace patent_date = patent_date_patex if patent_id=="5177313"
replace patent_date = patent_date_patex if patent_id=="5269569"
replace patent_date = patent_date_patex if patent_id=="6015565"
drop patent_date_patex

* Generate merged term_disc variable
replace term_disc_pv = 0 if term_disc_pv == .
replace term_disc_patex = 0 if term_disc_patex == .
gen byte term_disc = term_disc_pv
replace term_disc = 1 if term_disc_patex == 1
label variable term_disc "Terminal Disclaimer (merged)"

* Clean up
order patent_id reissue patent_numeric patent_date application_id filing_date application_patex filing_date_patex term_disc term_disc_pv term_disc_patex recorded_date source term_extension num_claims
sort patent_date reissue patent_numeric
save "term_disc_merged.dta", replace


*******************************
* Merge Litigated Patent Data *
*******************************

* Download 2020 file from USPTO website and rename as "litigated_patents.dta"
* https://www.uspto.gov/ip-policy/economic-research/research-datasets/patent-litigation-docket-reports-data
use "litigated_patents.dta", clear
rename patent patent_id
merge m:1 patent_id using "term_disc_merged.dta"
drop if _merge != 3
drop _merge
save "litigated_patents_term_disc.dta", replace
duplicates drop patent_id, force
sort patent_date reissue patent_numeric
save "litigated_patents_term_disc_no_dups.dta", replace


**************************
* Merge Orange Book Data *
**************************

* Download Ouellette & Sampat Orange Book data from Dataverse
* https://doi.org/10.7910/DVN/VI93T9
use "nda_govt_patents.dta", clear
drop gi coc report govtass register

* Drop 6 design patents
drop if substr(patent_id, 1, 1) == "D"

* Drop pre-1990 patents and ones after June 6, 2023
gen reissue = 0
replace reissue = 1 if substr(patent_id, 1, 1) == "R"
gen patent_numeric = patent_id
replace patent_numeric = substr(patent_id, 3, .) if substr(patent_id, 1, 1) == "R"
destring patent_numeric, replace
drop if reissue == 0 & patent_numeric < 4890335
drop if reissue == 1 & patent_numeric < 33138
* Most recent issued patents in PatEx are June 6, 2023, ending with 11672191 and RE49549
drop if reissue == 0 & patent_numeric > 11672191
drop if reissue == 1 & patent_numeric > 49549
drop reissue patent_numeric

* Merge terminal disclaimer data
merge m:1 patent_id using "term_disc_merged.dta"
drop if _merge != 3
* Note: 1 patents in our dataset (5677794) don't have a match; it is on list of withdrawn patents
* https://www.uspto.gov/patents/search/withdrawn-patent-numbers
drop _merge
save "nda_patents_term_disc.dta", replace
* This includes duplicate patent numbers
duplicates drop patent_id, force
sort patent_date reissue patent_numeric
save "nda_patents_term_disc_no_dups.dta", replace
